import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import plotly.graph_objs as go
import bs4
import requests
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import ExtraTreesRegressor
# Load the IMDb movie-metadata dataset (5043 movies, 28 columns) straight from GitHub.
dt=pd.read_csv("https://raw.githubusercontent.com/sundeepblue/movie_rating_prediction/master/movie_metadata.csv")
First we will look at the structure of the data, interpret the meaning of the feature columns, and then evaluate some problems in the data. The data includes 5,043 rows of different movies and 28 columns of information related to the movies.
There are some data problems that we need to handle:
''' Overview data '''
# Column names, dtypes, non-null counts, and memory usage.
print(dt.info())
# How many columns there are of each dtype (object vs. numeric).
print('\nNumber of column type:\n', dt.dtypes.value_counts())
''' Missing values'''
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarize the missing values of every column of *df*.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame indexed by column name, with 'Missing Values' (count)
    and '% of Total Values' (rounded to 1 decimal place), restricted to
    columns that have at least one missing value and sorted by percentage
    descending. A short textual summary is also printed.
    """
    # Count missing values once and reuse it for the percentage
    # (the original recomputed df.isnull().sum() twice).
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)
    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    # Keep only columns with missing data, highest percentage first
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    # Print some summary information (f-string instead of str() concatenation)
    print(f"\nDataframe has {df.shape[1]} columns.\n"
          f"There are {mis_val_table_ren_columns.shape[0]} "
          "columns that have missing values.\n", mis_val_table_ren_columns)
    # Return the dataframe with missing information
    return mis_val_table_ren_columns
# Missing value
# NOTE(review): 'misssing_col' has a typo (extra 's'); kept as-is in case a
# later notebook cell references it by this name.
misssing_col= missing_values_table(dt)
''' Check duplicate'''
def dup_remove(data):
    """Drop duplicated rows from *data*, reporting how many were removed.

    Bug fix: the original counted duplicates on the global ``dt`` instead of
    the ``data`` argument, so the printed count was wrong for any other
    frame (the misspelled 'dupicates' in the message is also corrected).
    """
    print('Remove', data.duplicated().sum(), 'duplicates')
    return data.drop_duplicates()
dt= dup_remove(dt)
# Number of unique classes in each object column
# (low-cardinality columns are candidates for the categorical grouping below)
print('\n Number of unique value:\n', dt.apply(pd.Series.nunique, axis = 0).sort_values())
def unique_count_plt(data, col_list):
    '''
    Bar plot of the percentage share of each distinct value (NaN included)
    for every column in col_list.
    data: dataframe
    col_list: list of column names to plot
    '''
    for name in col_list:
        freq = data[name].value_counts(dropna=False)
        share = freq * 100 / freq.sum()
        share.plot.bar(figsize=(15, 7))  # Series.plot.bar
        plt.title('% unique values in ' + name + ' column')
        plt.show()
unique_count_plt(dt, ['color','content_rating','country', 'language','facenumber_in_poster','aspect_ratio','title_year'])
def box_plt_category(data, col_list, target_col):
    '''
    Box plot of target_col for each categorical column, with categories
    ordered by descending median of the target.
    data: pandas data frame
    col_list: list of column names shown on the x_axis
    target_col: target variable column shown on the y_axis
    '''
    for feature in col_list:
        order_by_median = (data.loc[:, [feature, target_col]]
                           .groupby([feature])
                           .median()
                           .sort_values(by=target_col, ascending=False))
        plt.figure(figsize=(20, 6))
        ax = sns.boxplot(x=feature, y=target_col, data=data,
                         order=order_by_median.index)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90,
                           horizontalalignment='right')
        ax.set_title('Box plot of ' + feature, fontsize=16)
box_plt_category(dt, ['color','content_rating', 'country', 'language','facenumber_in_poster','title_year','aspect_ratio'], 'imdb_score')
The results of checking the columns that have fewer than 100 unique values (except the target column imdb_score):
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
def scatter_plt_numerical(data, col_list, target_col, col_movie_title):
    '''
    Interactive scatter plot of target_col (y-axis) against each column in
    col_list (x-axis); points are coloured by movie_facebook_likes and show
    the movie title on hover.
    data: pandas data frame
    col_list: list of column names shown on the x_axis
    target_col: target variable column shown on the y_axis
    col_movie_title: column holding the movie titles used as hover text
    '''
    for col in col_list:
        # Figure built as a raw dict; NOTE(review): the 'xaxis_title'/
        # 'yaxis_title' layout keys are not standard plotly layout
        # properties -- confirm the axis titles actually render.
        dict_of_fig = dict({
            "data": [{'y': data[target_col] ,
                      'x': data[col] ,
                      'mode': 'markers', # marker= scatter
                      'marker': dict(size=8, color=data.movie_facebook_likes, colorbar= dict(title="movie_facebook_likes"), colorscale='Viridis', showscale=True),
                      "text" : data[col_movie_title]
                     }],
            "layout": {"title": {"text": "Scatter plot between Imdb_score and " + col},
                       "xaxis_title":{"text": col },
                       "yaxis_title":{"text": "imdb_score" },
                      }
        })
        fig = go.Figure(dict_of_fig)
        fig.show()
# Numeric feature columns to plot against the imdb score.
list_col= ['director_facebook_likes','num_critic_for_reviews','duration', 'actor_1_facebook_likes', 'actor_2_facebook_likes',\
'actor_3_facebook_likes','cast_total_facebook_likes','gross','num_voted_users',\
'num_user_for_reviews','budget']
scatter_plt_numerical(dt, list_col, 'imdb_score','movie_title')
# color= ['lighseagreen','pink','red','blue','lavender','papayawhip','yellow','grey','cyan','orange','orchid']
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
def violin_plot(data, col_list):
    '''
    Violin plot: show the density distribution (with inner box plot and
    mean line) of each continuous column in col_list.
    '''
    for feature in col_list:
        trace = go.Violin(
            y=data[feature],
            box_visible=True,
            line_color='black',
            fillcolor='lightseagreen',
            opacity=0.6,
            meanline_visible=True,
            x0=feature,
        )
        go.Figure(trace).show()
list_col= ['movie_facebook_likes','director_facebook_likes','num_critic_for_reviews','duration', 'actor_1_facebook_likes',\
'actor_2_facebook_likes','actor_3_facebook_likes','cast_total_facebook_likes','gross','num_voted_users',\
'num_user_for_reviews','budget']
violin_plot(dt, list_col)
The results of checking the columns that have more than 100 unique values:
'''Histogram plot of target variable: imdb_score'''
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.figure_factory as ff
# Distribution of the target: histogram overlaid with a KDE curve.
x = dt['imdb_score'].values
hist_data = [x]
group_labels = ['imdb_score']
fig = ff.create_distplot(hist_data, group_labels, colors= ['grey'], bin_size=.1)
# Fix: 'desity' -> 'density' in the displayed title, and use update_layout
# (the documented API) instead of mutating fig.layout directly.
fig.update_layout(title='Histogram and density plot of Imdb_score')
fig.show()
The greatness of a movie is highly affected by its director, so I will add a new column that can measure this property.
''' Do web scraping to get canne director award from wikipedia '''
res1= requests.get('https://en.wikipedia.org/wiki/Palme_d%27Or')
# NOTE(review): no parser is passed to BeautifulSoup, so bs4 picks whichever
# is installed -- parse results may differ between environments; confirm.
soup1= bs4.BeautifulSoup(res1.content)
# Second table on the page, skipping the first three header rows.
table_rows1= soup1.select('table')[1].select('tr')[3:]
cell_values1=[]
# NOTE(review): range(len(...)-1) skips the final table row -- confirm
# against the live page whether that row is deliberately excluded.
for i in range(len(table_rows1)-1):
    cell_num= table_rows1[i].select('td')
    # Rows with <= 2 cells carry no winner entry; otherwise the director's
    # name link sits in a different cell depending on the row's cell count
    # (presumably because of rowspans on the year/country columns -- TODO
    # confirm against the page markup).
    if len(cell_num) <=2:
        continue
    elif len(cell_num) ==3:
        v= table_rows1[i].select('td')[1].select('a')[0].text
        cell_values1.append(v)
    elif len(cell_num) == 4:
        v= table_rows1[i].select('td')[2].select('a')[0].text
        cell_values1.append(v)
    else:
        v= table_rows1[i].select('td')[3].select('a')[0].text
        cell_values1.append(v)
# Collected names -> one-column dataframe, persisted for reuse.
canne_director= pd.DataFrame(cell_values1, columns=['Director'])
canne_director.to_csv('canne_directors.csv')
''' Do web scraping to get oscar director award from wikipedia '''
res2= requests.get('https://en.wikipedia.org/wiki/List_of_Academy_Award_for_Best_Director_winners_by_age')
soup2= bs4.BeautifulSoup(res2.content)
# First table, skipping the header row; the director name link is in the
# second cell of each row.
table_rows2= soup2.select('table')[0].select('tr')[1:]
cell_values2=[]
for i in range(len(table_rows2)-1):
    v= table_rows2[i].select('td')[1].select('a')[0].text
    cell_values2.append(v)
oscar_director= pd.DataFrame(cell_values2, columns=['Director'])
# Fix: this previously wrote to 'canne_directors.csv', clobbering the Cannes
# file saved just above; the Oscar list belongs in its own file.
oscar_director.to_csv('oscar_directors.csv')
''' Combine unique oscar and canne director names. Create director_aw column with 1 for receiving award and 0 for non award '''
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 --
# use pd.concat, which produces the same stacked frame.
director_award = pd.concat([oscar_director, canne_director], ignore_index=True)['Director'].unique()
dt['director_aw']= np.where(dt.director_name.isin(director_award),1,0)
''' imdb_score and number of directors received award'''
dt.groupby('imdb_score')['director_aw'].sum().plot(kind='bar',\
    title='imdb_score and number of directors received award',\
    figsize=(15,7))
sns.boxplot(x='director_aw', y='imdb_score', data=dt).set_title('Box plot of director award ')
Generally, directors who have received a Cannes or Oscar award have higher IMDb scores.
We will group these content ratings into 7 main groups
dt1= dt.copy()
# Collapse TV / legacy MPAA labels into the seven main content-rating groups
# (one .replace with a mapping instead of repeated .loc assignments).
rating_map = {
    'TV-MA': 'R',
    'TV-14': 'PG-13',
    'TV-G': 'G', 'TV-Y': 'G', 'TV-Y7': 'G',
    'TV-PG': 'PG', 'M': 'PG', 'GP': 'PG',
    'X': 'NC-17',
}
dt1['content_rating'] = dt1['content_rating'].replace(rating_map)
# Mean gross earning per content rating (G, PG and PG-13 lead)
dt1.groupby('content_rating')['gross'].mean().sort_values(ascending=False).plot(
    y='Gross Earning', kind='bar', title='Gross earning by Genres', figsize=(10, 6))
# Anything outside the main ratings (NaN kept as-is) is bucketed as 'others'.
content_l = ['R','PG','PG-13','G','NC-17',np.nan]
dt1['content_rating'] = np.where(~(dt1.content_rating.isin(content_l)), 'others', dt1.content_rating)
dt1.content_rating.unique()
G, PG, and PG-13 are the content ratings that often bring higher profits.
Eng_l= ['USA','UK','Canada', 'Australia','New Zealand']
# Movies from English-speaking countries with a missing language are English.
dt1.loc[(dt1.language.isnull()) & (dt1.country.isin(Eng_l)),'language']= 'English'
# One dummy for language: 1 = English, 0 = other, NaN = still unknown.
dt1['english'] = np.where(dt1.language == 'English', 1, 0)
dt1.loc[dt1.language.isnull(), 'english'] = np.NaN
dt1.english.unique()
# Keep the three most frequent countries; bucket everything else as 'others'.
country_l = ['USA','UK','France', np.nan]
dt1['country'] = np.where(~(dt1.country.isin(country_l)), 'others', dt1.country)
dt1.country.unique()
# Split genres rows into list of each value and then get its unique values.
# Create genre dummy variables by match each element of genre rows to new dummy variables.
import numpy as np
# Flatten every '|'-separated genres string into one list, then dedupe
# while preserving first-seen order.
all_genres = []
for entry in dt.genres:
    all_genres.extend(entry.split('|'))
genres = pd.unique(all_genres)
# One indicator column per genre, one row per movie.
dummies = pd.DataFrame(np.zeros((len(dt), len(genres))), columns=genres)
for row_pos, entry in enumerate(dt.genres):
    hit_cols = dummies.columns.get_indexer(entry.split('|'))
    dummies.iloc[row_pos, hit_cols] = 1
dummies.index = dt1.index
dt2 = dt1.join(dummies.add_prefix('genre_'))
dt2.loc[:, dt2.columns.str.startswith('genre_')].columns
genre_l = list(dt2.loc[:, dt2.columns.str.startswith('genre_')].columns)
dt2[genre_l].sum().sort_values(ascending=False).plot.bar(figsize=(8,6), title='Most common genres in movie' )
# genre_Game_Show, genre_Reality_TV, genre_News, genre_Short, and genre_Film-Noir account for a small number of movies (< 6)
# we can drop them
dt2 = dt2.drop(['genre_Game-Show', 'genre_Reality-TV', 'genre_News', 'genre_Short', 'genre_Film-Noir'], axis='columns')
Drama, comedy, thriller, and action are the most common genres for movies.
# Drop identifier / high-cardinality text columns before modeling: names,
# the IMDb link, raw genres (already one-hot encoded above), language
# (replaced by the 'english' dummy), and cast_total_facebook_likes.
# NOTE(review): 'gross' and 'title_year' are dropped too -- presumably to
# avoid post-release information leaking into the model; confirm intent.
drop_l= ['gross','movie_imdb_link','plot_keywords','director_name','actor_3_name','actor_2_name','actor_1_name','title_year',\
'movie_title','genres','language','cast_total_facebook_likes' ]
dt3= dt2.drop(drop_l,axis= 'columns')
For missing values, we summarize the affected columns and then drop the remaining incomplete rows:
m= missing_values_table(dt3)
print('Number of rows before drop missing values:', dt3.shape[0])
# Listwise deletion: drop every row that still has any missing value.
dt4= dt3.dropna()
print('Number of rows after drop missing values:', dt4.shape[0])
# One-hot encode the remaining object columns.
dt5=pd.get_dummies(dt4)
dt5.columns
''' Dummy variables often cause multicolliner. To reduce the correlation among dummies variables, we can remove one feature
column from each type of variable'''
# Drop one dummy level per categorical variable (reference category).
drop_l2= ['color_ Black and White','country_others','content_rating_others']
dt6= dt5.drop(drop_l2,axis= 'columns')
dt6.to_csv('cleaned_movie_rating.csv')
# Number of feature left
len(dt6.columns)
correlations = dt6.corr()
f,ax = plt.subplots(figsize=(15,15))
# Mask the upper triangle (incl. diagonal) so each pair appears only once.
matrix = np.triu(correlations)
sns.heatmap(correlations,vmin=-1, vmax=1, linewidths=.5,center= 0,mask= matrix)
We will apply three feature-importance methods (random-forest impurity importance, permutation importance, and SHAP importance) and compare the results:
X= dt6.drop(['imdb_score'],axis= 1)
y= dt6['imdb_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
''' Random forest importance '''
feat_labels= X_train.columns
forest= RandomForestRegressor(n_estimators= 500, random_state= 1)
forest.fit(X_train, y_train)
importance_rdf= forest.feature_importances_ # get importance values of each feature
indices_rdf= np.argsort(importance_rdf)
plt.figure(figsize=(8,8))
plt.barh(feat_labels[indices_rdf], importance_rdf[indices_rdf])
plt.xlabel("Random Forest Feature Importance")
''' Get top 25 importance features'''
importance_rdf[indices_rdf]
feat_labels[indices_rdf]
d1= {'feature':feat_labels[indices_rdf], 'importance':importance_rdf[indices_rdf]}
rdf_imp= pd.DataFrame(d1).sort_values(by='importance', ascending=False)
rdf_25= rdf_imp.iloc[0:25,0]
''' Permutation importance '''
from sklearn.inspection import permutation_importance
# Fix: seed the permutation shuffling so results are reproducible, matching
# the random_state=1 convention used everywhere else in this script.
perm_importance = permutation_importance(forest, X_test, y_test, random_state=1)
indices_per = perm_importance.importances_mean.argsort()
plt.figure(figsize=(8,8))
plt.barh(feat_labels[indices_per], perm_importance.importances_mean[indices_per])
plt.xlabel("Permutation Importance")
''' Get top 25 importance features'''
# Feature -> mean importance, built with a dict comprehension instead of
# the original manual accumulation loop.
feats_per = {feature: importance
             for feature, importance in zip(feat_labels[indices_per],
                                            perm_importance.importances_mean[indices_per])}
permu_imp = pd.DataFrame(list(feats_per.items()),columns = ['feature','importance'])
permu_25= permu_imp.sort_values(['importance'], ascending=False).iloc[0:25,0]
''' Shap importance'''
import shap
# TreeExplainer computes SHAP values for tree ensembles such as this forest.
importance = shap.TreeExplainer(forest)
shap_values = importance.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar", max_display=X_test.shape[1])
''' Get top 25 importance features'''
# Mean |SHAP| across test rows = global importance of each feature.
shap_vals= np.abs(shap_values).mean(0)
shap_fea_imp = pd.DataFrame(list(zip(X_train.columns,shap_vals)),columns=['col_name','feature_importance_vals'])
shap_fea_imp.sort_values(by=['feature_importance_vals'],ascending=False,inplace=True)
# Names of the 25 features with the highest mean |SHAP|.
shap_25= shap_fea_imp.iloc[0:25,0]
''' Selected features'''
# Union of the three top-25 lists (random forest, permutation, SHAP).
select_fea= list(set(rdf_25).union(set(permu_25), set(shap_25)))
select_fea
X1= dt6[select_fea]
y1= dt6['imdb_score']
# Same 70/30 split and seed as the feature-importance stage.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=1)
import warnings
warnings.filterwarnings('ignore')
stdScaler= StandardScaler()
'''Extra Tree Regressor'''
ex_tree=ExtraTreesRegressor(random_state=1)
# NOTE(review): 'mse'/'mae' are deprecated aliases of 'squared_error'/
# 'absolute_error' in scikit-learn >= 1.0 and are removed in 1.2; update
# them when the environment's sklearn is upgraded.
ex_tree_grid={'criterion': ['mse', 'friedman_mse', 'mae'],
              'n_estimators':[100,200],
             }
''' XGBoost '''
xgb_model = xgb.XGBRegressor(random_state=1)
# Scale inputs before XGBoost (harmless for trees, keeps pipelines uniform).
xgb_pipe= Pipeline([('scaler', stdScaler), ('xgboost', xgb_model)])
# Fix: 'reg_lamda' was a typo -- XGBoost silently ignores unknown keyword
# parameters, so the L2 penalty was never actually tuned. The correct
# parameter name is 'reg_lambda'.
# NOTE(review): objective 'reg:linear' is a deprecated alias of
# 'reg:squarederror' in recent XGBoost versions.
xgbgrid= {
    'xgboost__objective':['reg:linear'],
    'xgboost__learning_rate':[0.0001, 0.001, 0.01, 0.1, 1.0],
    'xgboost__n_estimators':[500],
    'xgboost__reg_lambda':[1e-5, 1e-2, 0.1, 1.0, 100.0],
    'xgboost__reg_alpha':[1e-5, 1e-2, 0.1, 1.0, 100.0]
}
''' Random forest'''
rdforest = RandomForestRegressor(random_state=1)
rdforest_grid = {'n_estimators': [500],
                 'max_features': ['log2', 'sqrt'],
                 'criterion': [ 'mse','mae']
                }
'''Neural Network'''
neuralnet= MLPRegressor(random_state=1)
# MLP needs standardized inputs, hence the scaler in the pipeline.
nnet_pipe = Pipeline([('scaler', stdScaler), ('neural_net', neuralnet)])
nnet_grid = {
    'neural_net__solver': ['adam'],
    'neural_net__max_iter': [500],
    'neural_net__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'neural_net__hidden_layer_sizes':[(10,),(25,),(50,)],
    'neural_net__learning_rate': ['adaptive']
}
# model name
modelnames= ['ex_tree','xgbreg','rdforest', 'neural_net']
#gridsearch list
gridList= [ex_tree_grid, xgbgrid,rdforest_grid, nnet_grid]
#List pipe
pipeList= [ex_tree, xgb_pipe, rdforest, nnet_pipe]
metrics= pd.DataFrame(data=None, columns= ['mae_train','mae_test','rmse_train','rmse_test','r2_train','r2_test'])
modelrs= pd.DataFrame(data=None, columns= ['model', 'y_train_pred','y_test_pred' ])
# Grid-search each estimator, record its metrics and test predictions.
for pipe, grid, name in zip(pipeList, gridList, modelnames):
    # 5-fold CV with a fixed seed so every model sees the same folds.
    k_fold = KFold(n_splits=5, random_state=1, shuffle=True).split(X1_train, y1_train)
    gs= GridSearchCV(pipe, grid, scoring= 'r2', cv= k_fold, n_jobs=-1)
    # Fit grid search
    best_model = gs.fit(X1_train, y1_train)
    # get parameter and best model
    print('\nModel: ' , name,'\n')
    print(best_model.best_params_)
    y_train_pred= best_model.predict(X1_train)
    y_test_pred= best_model.predict(X1_test)
    # Metrics to evaluate the performance
    mae_train= mean_absolute_error(y1_train, y_train_pred)
    mae_test= mean_absolute_error(y1_test, y_test_pred)
    # Fix: squared=False so these really are RMSE -- squared=True returns
    # the plain MSE despite the 'rmse_*' variable names.
    rmse_train= mean_squared_error(y1_train, y_train_pred, squared=False)
    rmse_test= mean_squared_error(y1_test, y_test_pred, squared=False)
    r2_train= r2_score(y1_train, y_train_pred)
    r2_test= r2_score(y1_test, y_test_pred)
    rs= pd.Series({'mae_train': mae_train,'mae_test': mae_test,'rmse_train': rmse_train,'rmse_test': rmse_test,\
                   'r2_train': r2_train,'r2_test':r2_test }, name= name)
    # Fix: DataFrame.append was removed in pandas 2.0 -- use pd.concat.
    metrics= pd.concat([metrics, rs.to_frame().T])
    gr= pd.Series({'model': best_model.best_estimator_, 'y_train_pred':y_train_pred, 'y_test_pred':y_test_pred}, name= name)
    modelrs = pd.concat([modelrs, gr.to_frame().T])
''' Stacking model'''
# Base learners configured with the best hyper-parameters found above.
ex_treebest= ExtraTreesRegressor(criterion='mse', n_estimators= 200, random_state=1)
# Fix: 'reg_lamda' -> 'reg_lambda' (the misspelled keyword was silently
# ignored by XGBoost, leaving the L2 penalty at its default).
xgboostbest= xgb.XGBRegressor(learning_rate= 0.1, n_estimators= 500,
                              objective='reg:squarederror',reg_alpha= 1.0,
                              reg_lambda= 1e-05,random_state=1)
rdforestbest= RandomForestRegressor(criterion= 'mse', max_features= 'sqrt', n_estimators=500, random_state=1)
neural_netbest= MLPRegressor(alpha= 1.0, hidden_layer_sizes= (50,),
                             learning_rate= 'adaptive', max_iter= 500,
                             solver='adam', random_state=1)
estimators = [('ex_tree', ex_treebest),('xgb', xgboostbest), ('forest', rdforestbest), ('nnet', neural_netbest)]
stackreg = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=20,random_state=1))
stackreg.fit(X1_train, y1_train)
y_train_pred= stackreg.predict(X1_train)
y_test_pred= stackreg.predict(X1_test)
# Metrics to evaluate the performance
mae_train= mean_absolute_error(y1_train, y_train_pred)
mae_test= mean_absolute_error(y1_test, y_test_pred)
# Fix: squared=False so these really are RMSE (squared=True returns MSE).
rmse_train= mean_squared_error(y1_train, y_train_pred, squared=False)
rmse_test= mean_squared_error(y1_test, y_test_pred, squared=False)
r2_train= r2_score(y1_train, y_train_pred)
r2_test= r2_score(y1_test, y_test_pred)
rs= pd.Series({'mae_train': mae_train,'mae_test': mae_test,'rmse_train': rmse_train,'rmse_test': rmse_test,\
               'r2_train': r2_train,'r2_test':r2_test }, name= 'stackreg')
# Fix: DataFrame.append was removed in pandas 2.0 -- use pd.concat.
metrics= pd.concat([metrics, rs.to_frame().T])
# Fix: store the stacking model itself -- the original stored
# best_model.best_estimator_, i.e. the LAST grid search's winner, so the
# saved 'stackreg' model was actually the neural network.
gr= pd.Series({'model': stackreg, 'y_train_pred':y_train_pred, 'y_test_pred':y_test_pred}, name= 'stackreg')
modelrs = pd.concat([modelrs, gr.to_frame().T])
''' Function to save models'''
modelnames1= ['ex_tree','xgbreg','rdforest', 'neural_net', 'stackreg']
# Fix: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; the standalone joblib package is the documented
# replacement (installed as a scikit-learn dependency).
import joblib
def savemodel(modeldf,colname,listmodel):
    """Persist each fitted model in *modeldf* to '<name>.pkl'.

    modeldf: dataframe indexed by model name
    colname: column holding the fitted estimator objects
    listmodel: index labels (model names) to save
    """
    for i in listmodel:
        joblib.dump(modeldf.loc[i][colname], i+'.pkl')
savemodel(modelrs,'model',modelnames1)
metrics.to_csv('metrics.csv')
pd.options.display.float_format = "{:,.4f}".format
metrics
''' Residuals plot '''
def residual_plt(modeldt, y_train, y_test):
    '''
    Scatter the residuals (prediction - truth) of every model in modeldt
    for both the training and the test split.
    '''
    for model_name, row in modeldt.iterrows():
        plt.figure(figsize=(10,5))
        plt.scatter(row.y_train_pred, row.y_train_pred - y_train,
                    c='steelblue', edgecolor='white', marker='o',
                    s=35, alpha=0.9, label='Training data')
        plt.scatter(row.y_test_pred, row.y_test_pred - y_test,
                    c='limegreen', edgecolor='white', marker='s',
                    s=35, alpha=0.9, label='Test data')
        plt.xlabel('Predicted values')
        plt.ylabel('Residuals')
        plt.legend(loc='upper left')
        plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='black')
        plt.xlim([0, 15])
        plt.title('Residuals plot of '+ model_name)
        plt.show()
residual_plt(modelrs, y1_train, y1_test)
Overall, the residuals of XGBoost have a better shape.
''' Predicted values vs True values'''
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
# Recover the movie titles of the test rows (dt2 still has movie_title;
# the modeling frames dropped it).
indexmovie= [i for i in y1_test.index]
movie_name= dt2.loc[indexmovie,:].movie_title
dfrs={'movie': movie_name, 'y1_test': y1_test.values}
test_score= pd.DataFrame(dfrs, index= y1_test.index)
def predict_true_plt(test_df, model_df, metrics, movie_title):
    '''
    Scatter plot of predicted vs. actual imdb scores for every model,
    coloured by absolute residual, with two reference movies annotated.
    test_df: data frame holding the true test scores ('y1_test')
    model_df: data frame holding each model's 'y_test_pred'
    metrics: metrics table indexed by model name (shown in the title)
    movie_title: series of movie titles shown on hover
    '''
    md_names= metrics.index
    for i in range(len(metrics)):
        # NOTE(review): mutates test_df in place by (re)writing 'y1_predict'.
        test_df['y1_predict']= model_df.iloc[i].y_test_pred
        col= (test_df['y1_test'] - test_df['y1_predict']).abs()
        dict_of_fig = dict({
            "data": [{'y': test_df['y1_test'] ,
                      'x': test_df['y1_predict'] ,
                      'mode': 'markers', # marker= scatter. 'markers+text' would pin the labels on the chart instead of showing them on hover
                      'marker': dict(size=8, color=col, colorbar= dict(title="Absolute residuals"), colorscale='Viridis', showscale=True),
                      #'text' : movie_name
                      'hovertext' : movie_title
                     }],
            "layout": {'title': {'text': 'Imdb scores of ' + md_names[i] +
                                 '.MAE:' + str(np.round(metrics.iloc[i,1],3)) +
                                 '.RMSE:' + str(np.round(metrics.iloc[i,3],3)) +
                                 '. R2:' + str(np.round(metrics.iloc[i,5],3))
                                },
                       'xaxis_title':{'text': 'Predicted scores' },
                       'yaxis_title':{'text': 'Actual score' }}
        })
        fig = go.Figure(dict_of_fig)
        # NOTE(review): rows 2295 and 683 are hard-coded -- presumably
        # Superbabies and Fight Club under this split/seed; these lookups
        # break if the split or seed changes. TODO confirm.
        fig.update_layout(annotations=[
            dict(x=test_df.loc[2295,].y1_predict ,
                 y=test_df.loc[2295,].y1_test,
                 showarrow=True,
                 text='Superbabies:Baby Geniuses 2. Pre:' +str(np.round(test_df.loc[2295,].y1_predict,1)) + ', True:'\
                 +str(np.round(test_df.loc[2295,].y1_test,1)),
                 xanchor='left',
                 yanchor='bottom'),
            dict(x=test_df.loc[683,].y1_predict ,
                 y=test_df.loc[683,].y1_test,
                 showarrow=True,
                 text='Fight Club. Pre:' + str(np.round(test_df.loc[683,].y1_predict,1)) + ', True:'\
                 +str(np.round(test_df.loc[683,].y1_test,1))),
        ], autosize=True)
        fig.show()
predict_true_plt(test_score,modelrs, metrics, movie_name)
In general, the predicted and true values form a 45-degree line, which means they are close to each other. Movies such as Superbabies: Baby Geniuses 2 are not predicted well, but movies such as Fight Club have well-predicted scores. We will check how the input variables affect the output of the XGBoost model.
''' How features affect to Xgboost results'''
# Fix: 'reg_lamda' -> 'reg_lambda'; the misspelled keyword was silently
# ignored, so the model was trained with the default L2 penalty instead of
# the intended 1e-05.
xgb_model= xgb.XGBRegressor(learning_rate= 0.1, n_estimators= 500,
                            objective='reg:squarederror',reg_alpha= 1,
                            reg_lambda= 1e-05,random_state=1)
xgb_model.fit(X1_train, y1_train)
# SHAP values on the test set: per-feature contribution to each prediction.
explainerxgb = shap.TreeExplainer(xgb_model)
shap_values_xgb = explainerxgb.shap_values(X1_test)
shap.summary_plot(shap_values_xgb, X1_test, max_display=X1_test.shape[1])
The order of the y-axis indicates the importance of the features in the model, and shows how the output changes as these features change their values:
!jupyter nbconvert ratingCopy1.ipynb --to html